
{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "[![AWS SDK for pandas](_static/logo.png \"AWS SDK for pandas\")](https://github.com/aws/aws-sdk-pandas)\n", "\n", "# 25 - Redshift - Loading Parquet files with Spectrum" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Enter your bucket name:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Install the optional modules first\n", "!pip install 'awswrangler[redshift]'" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " ···········································\n" ] } ], "source": [ "import getpass\n", "\n", "bucket = getpass.getpass()\n", "PATH = f\"s3://{bucket}/files/\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Mocking some Parquet Files on S3" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>col0</th>\n", " <th>col1</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>b</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2</td>\n", " <td>c</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4</td>\n", " <td>e</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>5</td>\n", " <td>f</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>6</td>\n", " <td>g</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>7</td>\n", " <td>h</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>8</td>\n", " <td>i</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>9</td>\n", " <td>j</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " col0 col1\n", "0 0 a\n", "1 1 b\n", "2 2 c\n", "3 3 d\n", "4 4 e\n", "5 5 f\n", "6 6 g\n", "7 7 h\n", "8 8 i\n", "9 9 j" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "import awswrangler as wr\n", "\n", "df = pd.DataFrame(\n", " {\n", " \"col0\": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n", " \"col1\": [\"a\", \"b\", \"c\", \"d\", \"e\", \"f\", \"g\", \"h\", \"i\", \"j\"],\n", " }\n", ")\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "wr.s3.to_parquet(df, PATH, max_rows_by_file=2, dataset=True, mode=\"overwrite\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Crawling the metadata and adding into Glue Catalog" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "({'col0': 'bigint', 'col1': 'string'}, None, None)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.s3.store_parquet_metadata(path=PATH, database=\"aws_sdk_pandas\", table=\"test\", dataset=True, mode=\"overwrite\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Running the CTAS query to load the data into Redshift storage" ] }, { "cell_type": 
"code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "con = wr.redshift.connect(connection=\"aws-sdk-pandas-redshift\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "query = \"CREATE TABLE public.test AS (SELECT * FROM aws_sdk_pandas_external.test)\"" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "with con.cursor() as cursor:\n", " cursor.execute(query)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Running an INSERT INTO query to load MORE data into Redshift storage" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(\n", " {\n", " \"col0\": [10, 11],\n", " \"col1\": [\"k\", \"l\"],\n", " }\n", ")\n", "wr.s3.to_parquet(df, PATH, dataset=True, mode=\"overwrite\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "query = \"INSERT INTO public.test (SELECT * FROM aws_sdk_pandas_external.test)\"" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "with con.cursor() as cursor:\n", " cursor.execute(query)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Checking the result" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "query = \"SELECT * FROM public.test\"" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>col0</th>\n", " <th>col1</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>5</td>\n", " <td>f</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>b</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>6</td>\n", " <td>g</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>8</td>\n", " <td>i</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>10</td>\n", " <td>k</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>4</td>\n", " <td>e</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>0</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>2</td>\n", " <td>c</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>7</td>\n", " <td>h</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>9</td>\n", " <td>j</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>11</td>\n", " <td>l</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " col0 col1\n", "0 5 f\n", "1 1 b\n", "2 3 d\n", "3 6 g\n", "4 8 i\n", "5 10 k\n", "6 4 e\n", "7 0 a\n", "8 2 c\n", "9 7 h\n", "10 9 j\n", "11 11 l" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wr.redshift.read_sql_table(con=con, schema=\"public\", table=\"test\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "con.close()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.14", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, 
"file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.14" }, "pycharm": { "stem_cell": { "cell_type": "raw", "metadata": { "collapsed": false }, "source": [] } } }, "nbformat": 4, "nbformat_minor": 4 }